{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5cd3b160-b6fc-4240-bccd-1f0ffe1a77ee",
   "metadata": {},
   "source": [
    "# 第六节、数据清洗"
   ]
  },
  {
   "cell_type": "code",
   "id": "76f39886-f41c-4e87-b0dd-ea5e35257ffc",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.141899Z",
     "start_time": "2025-06-20T05:48:23.123901Z"
    }
   },
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ],
   "outputs": [],
   "execution_count": 3
  },
  {
   "cell_type": "code",
   "id": "c945e77c-e686-48bb-aaf2-45cf62d1329e",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.188263Z",
     "start_time": "2025-06-20T05:48:23.172026Z"
    }
   },
   "source": [
    "data = {\n",
    "    'color': ['red','blue','red','green','blue',None,'red'],\n",
    "    'price': [10, 20, 10, 15, 20, 0, np.nan],\n",
    "}\n",
    "\n",
    "df = pd.DataFrame(\n",
    "    data=data\n",
    ")\n",
    "\n",
    "df"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0\n",
       "5   None    0.0\n",
       "6    red    NaN"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 4
  },
  {
   "cell_type": "markdown",
   "id": "7057566a-919f-469d-9b3a-55b8997d40b5",
   "metadata": {},
   "source": [
    "## （1）重复值处理"
   ]
  },
  {
   "cell_type": "code",
   "id": "c5fc7a33-b551-45b2-9afb-fb263ff9553f",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.230492Z",
     "start_time": "2025-06-20T05:48:23.217438Z"
    }
   },
   "source": [
    "# 判断是否存在重复数据\n",
    "df.duplicated()"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1    False\n",
       "2     True\n",
       "3    False\n",
       "4     True\n",
       "5    False\n",
       "6    False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 5
  },
  {
   "cell_type": "code",
   "id": "0043b907-832e-4c5f-99b2-2d014f5c6d30",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.253506Z",
     "start_time": "2025-06-20T05:48:23.242332Z"
    }
   },
   "source": [
    "# 删除重复数据\n",
    "df.drop_duplicates()"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "3  green   15.0\n",
       "5   None    0.0\n",
       "6    red    NaN"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 6
  },
  {
   "cell_type": "code",
   "id": "de188ce5-ff3a-4af7-a3d7-877403e1ea50",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.284565Z",
     "start_time": "2025-06-20T05:48:23.274316Z"
    }
   },
   "source": [
    "# 如果不给定inplace=True，那么原来的数据并不会被改变\n",
    "df"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0\n",
       "5   None    0.0\n",
       "6    red    NaN"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 7
  },
  {
   "cell_type": "markdown",
   "id": "7de09dde-2f30-47e4-a04c-caf9d378936e",
   "metadata": {},
   "source": [
    "## （2）缺失值处理"
   ]
  },
  {
   "cell_type": "code",
   "id": "de0b847f-8061-4360-bc38-214488e9e331",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.317192Z",
     "start_time": "2025-06-20T05:48:23.308620Z"
    }
   },
   "source": [
    "# 空数据过滤\n",
    "# 判断是否存在空数据\n",
    "df.isnull()    # None 和 np.nan 都会被认作空数据"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   color  price\n",
       "0  False  False\n",
       "1  False  False\n",
       "2  False  False\n",
       "3  False  False\n",
       "4  False  False\n",
       "5   True  False\n",
       "6  False   True"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 8
  },
  {
   "cell_type": "code",
   "id": "3742e52f-7cf4-40c1-b264-f84e098f5fc5",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.349100Z",
     "start_time": "2025-06-20T05:48:23.339153Z"
    }
   },
   "source": [
    "# 删除空数据\n",
    "df.dropna()"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 9
  },
  {
   "cell_type": "code",
   "id": "087081d7-1b14-4d72-b98e-1373be477b24",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.403903Z",
     "start_time": "2025-06-20T05:48:23.392356Z"
    }
   },
   "source": [
    "# 删除空数据，指定 how 参数告诉Pandas如何删除\n",
    "df.dropna(how='any')  # 只要一行中有任意一列是空，就删掉一整行"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 10
  },
  {
   "cell_type": "code",
   "id": "e49e0f8a-c69b-4158-adbf-a95b4c913e93",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.458063Z",
     "start_time": "2025-06-20T05:48:23.447312Z"
    }
   },
   "source": [
    "# 填充空数据\n",
    "# 上面直接删除的做法显然太过于极端，如果数据本就不多，这样的删除操作会带来样本的大量减少\n",
    "# 所以填充空数据也是一种处理方法\n",
    "df.fillna(value=0)   # 缺失的地方用value来填补（注意：color 列的缺失值也会被填成 0）"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0\n",
       "5      0    0.0\n",
       "6    red    0.0"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 11
  },
  {
   "cell_type": "markdown",
   "id": "fb4035c4-7b57-4639-8bae-fdf484f64c14",
   "metadata": {},
   "source": [
    "## （3）指定行或者列过滤"
   ]
  },
  {
   "cell_type": "code",
   "id": "c525ecc7-c1e0-487f-8a44-546da8a5221a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.484651Z",
     "start_time": "2025-06-20T05:48:23.472807Z"
    }
   },
   "source": [
    "df"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0\n",
       "5   None    0.0\n",
       "6    red    NaN"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 12
  },
  {
   "cell_type": "code",
   "id": "134ceaea-6371-413e-8c8a-e19e812d043f",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.514915Z",
     "start_time": "2025-06-20T05:48:23.509344Z"
    }
   },
   "source": [
    "# 直接删除某列\n",
    "# 方式一\n",
    "del df['color']"
   ],
   "outputs": [],
   "execution_count": 13
  },
  {
   "cell_type": "code",
   "id": "4f8f0098-460c-444f-a0a2-9723963b33c5",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.545544Z",
     "start_time": "2025-06-20T05:48:23.533936Z"
    }
   },
   "source": [
    "df  # del关键字来删除，直接改变了原始的df"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   price\n",
       "0   10.0\n",
       "1   20.0\n",
       "2   10.0\n",
       "3   15.0\n",
       "4   20.0\n",
       "5    0.0\n",
       "6    NaN"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 14
  },
  {
   "cell_type": "code",
   "id": "0b0ed490-ff1f-4272-abd2-ccfd041d7b7f",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.589986Z",
     "start_time": "2025-06-20T05:48:23.579846Z"
    }
   },
   "source": [
    "# 把数据先恢复一下\n",
    "df = pd.DataFrame(\n",
    "    data=data\n",
    ")\n",
    "\n",
    "df"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0\n",
       "5   None    0.0\n",
       "6    red    NaN"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 15
  },
  {
   "cell_type": "code",
   "id": "8183ef00-79e0-4e71-b2ae-33a0f5dd8d25",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.625688Z",
     "start_time": "2025-06-20T05:48:23.615976Z"
    }
   },
   "source": [
    "# 方式二：用 drop 指定要删除的列（axis=1 表示按列删除）\n",
    "df.drop(labels=['price'], axis=1)"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   color\n",
       "0    red\n",
       "1   blue\n",
       "2    red\n",
       "3  green\n",
       "4   blue\n",
       "5   None\n",
       "6    red"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 16
  },
  {
   "cell_type": "code",
   "id": "a3e4c1da-f46a-4f7a-b5ab-9062890666de",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.673582Z",
     "start_time": "2025-06-20T05:48:23.664653Z"
    }
   },
   "source": [
    "df  # 使用drop来删除，不会直接修改原df（返回的是一个新的DataFrame）"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   color  price\n",
       "0    red   10.0\n",
       "1   blue   20.0\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0\n",
       "5   None    0.0\n",
       "6    red    NaN"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>None</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 17
  },
  {
   "cell_type": "code",
   "id": "cd90afb3-c764-438b-9b72-8d37a05ab1f3",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.702356Z",
     "start_time": "2025-06-20T05:48:23.693827Z"
    }
   },
   "source": [
    "# 删除指定的行\n",
    "df.drop(labels=[0, 1, 5], axis=0)"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   color  price\n",
       "2    red   10.0\n",
       "3  green   15.0\n",
       "4   blue   20.0\n",
       "6    red    NaN"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>green</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>blue</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>red</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 18
  },
  {
   "cell_type": "markdown",
   "id": "1eff1a15-036d-4c7a-aa10-a1dc5c7baf29",
   "metadata": {},
   "source": [
    "## （4）函数filter的使用"
   ]
  },
  {
   "cell_type": "code",
   "id": "de52f7c6-01d0-430f-be8c-8b0fb697845b",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.731712Z",
     "start_time": "2025-06-20T05:48:23.720254Z"
    }
   },
   "source": [
    "df = pd.DataFrame(\n",
    "    data=np.array([[3, 7, 1], [2, 8, 256]]),\n",
    "    index=['dog', 'cat'],\n",
    "    columns=['China', 'America', 'France']\n",
    ")\n",
    "\n",
    "df"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     China  America  France\n",
       "dog      3        7       1\n",
       "cat      2        8     256"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>China</th>\n",
       "      <th>America</th>\n",
       "      <th>France</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>dog</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cat</th>\n",
       "      <td>2</td>\n",
       "      <td>8</td>\n",
       "      <td>256</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 19
  },
  {
   "cell_type": "code",
   "id": "05acc9dd-00f2-47e8-8a62-e80c7bf0db90",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.766477Z",
     "start_time": "2025-06-20T05:48:23.754393Z"
    }
   },
   "source": [
    "# 这都有点不像是清洗数据，这应该是过滤查询找出想要的数据\n",
    "df.filter(items=['China', 'France'])   # items是指定待查询的列名"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     China  France\n",
       "dog      3       1\n",
       "cat      2     256"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>China</th>\n",
       "      <th>France</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>dog</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cat</th>\n",
       "      <td>2</td>\n",
       "      <td>256</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 20
  },
  {
   "cell_type": "code",
   "id": "07ad8677-b970-4d6e-8cb3-e227de6cbaca",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.805336Z",
     "start_time": "2025-06-20T05:48:23.796883Z"
    }
   },
   "source": [
    "df  # filter函数不会改变原df"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     China  America  France\n",
       "dog      3        7       1\n",
       "cat      2        8     256"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>China</th>\n",
       "      <th>America</th>\n",
       "      <th>France</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>dog</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cat</th>\n",
       "      <td>2</td>\n",
       "      <td>8</td>\n",
       "      <td>256</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 21
  },
  {
   "cell_type": "code",
   "id": "fdb7f444-8c3f-46f9-af0a-926e4465c751",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.829165Z",
     "start_time": "2025-06-20T05:48:23.818779Z"
    }
   },
   "source": [
    "# 根据正则表达式筛选列标签（保留匹配的列，而不是删除）\n",
    "df.filter(regex='a$', axis=1)   # a结尾的列，axis=1表示列"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     China  America\n",
       "dog      3        7\n",
       "cat      2        8"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>China</th>\n",
       "      <th>America</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>dog</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cat</th>\n",
       "      <td>2</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 22
  },
  {
   "cell_type": "code",
   "id": "398a5aec-0641-4083-8bc0-7f6caa634fdd",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.869108Z",
     "start_time": "2025-06-20T05:48:23.857460Z"
    }
   },
   "source": [
    "# 选择行索引标签中包含og的行记录\n",
    "df.filter(like='og', axis=0)   # like是模糊匹配：保留标签中包含该子串的项，axis=0表示在行索引上操作"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     China  America  France\n",
       "dog      3        7       1"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>China</th>\n",
       "      <th>America</th>\n",
       "      <th>France</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>dog</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 23
  },
  {
   "cell_type": "markdown",
   "id": "f6d4e93d-4d45-4f6f-be30-712931e7b1a2",
   "metadata": {},
   "source": [
    "## （5）异常值处理"
   ]
  },
  {
   "cell_type": "code",
   "id": "e3e79de7-f94a-4ce5-90d4-4f926529c543",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.905605Z",
     "start_time": "2025-06-20T05:48:23.894087Z"
    }
   },
   "source": [
    "df2 = pd.DataFrame(\n",
    "    data=np.random.randn(10000, 3)   # 生成标准正态分布数据\n",
    ")\n",
    "df2"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "             0         1         2\n",
       "0    -1.170869 -0.003572  0.641649\n",
       "1    -0.782275 -1.015805 -0.866671\n",
       "2     0.086411  1.582896 -1.368608\n",
       "3     1.319277 -1.072977 -0.619248\n",
       "4     0.710679 -1.026320 -0.401833\n",
       "...        ...       ...       ...\n",
       "9995  0.039600  1.374489 -0.350555\n",
       "9996  0.866373  2.376633 -0.192814\n",
       "9997 -0.563033 -1.212696 -0.111150\n",
       "9998 -0.209867  1.462870 -1.271430\n",
       "9999  0.455442 -0.281785  0.341316\n",
       "\n",
       "[10000 rows x 3 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1.170869</td>\n",
       "      <td>-0.003572</td>\n",
       "      <td>0.641649</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.782275</td>\n",
       "      <td>-1.015805</td>\n",
       "      <td>-0.866671</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.086411</td>\n",
       "      <td>1.582896</td>\n",
       "      <td>-1.368608</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.319277</td>\n",
       "      <td>-1.072977</td>\n",
       "      <td>-0.619248</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.710679</td>\n",
       "      <td>-1.026320</td>\n",
       "      <td>-0.401833</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>0.039600</td>\n",
       "      <td>1.374489</td>\n",
       "      <td>-0.350555</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>0.866373</td>\n",
       "      <td>2.376633</td>\n",
       "      <td>-0.192814</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>-0.563033</td>\n",
       "      <td>-1.212696</td>\n",
       "      <td>-0.111150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>-0.209867</td>\n",
       "      <td>1.462870</td>\n",
       "      <td>-1.271430</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>0.455442</td>\n",
       "      <td>-0.281785</td>\n",
       "      <td>0.341316</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 3 columns</p>\n",
       "</div>"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 24
  },
  {
   "cell_type": "code",
   "id": "4c1e5128-45db-49f3-9a0e-5afd3602f4f8",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.944965Z",
     "start_time": "2025-06-20T05:48:23.935074Z"
    }
   },
   "source": [
    "# 使用3σ原则，来定义异常值\n",
    "sigma = df2.std()\n",
    "sigma"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.990442\n",
       "1    1.008342\n",
       "2    0.995393\n",
       "dtype: float64"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 25
  },
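  {
   "cell_type": "markdown",
   "id": "f5255c67-7591-45bf-ae66-2634a2a5ea1f",
   "metadata": {},
   "source": [
    "定义异常值x：$|x| > 3\\sigma$，即 $x < -3\\sigma$ 或 $x > 3\\sigma$（正常值范围为 $-3\\sigma \\le x \\le 3\\sigma$）。严格的3σ原则是 $|x - \\mu| > 3\\sigma$；这里的数据来自标准正态分布，均值 $\\mu \\approx 0$，因此省略了减去均值的步骤。"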
   ]
  },
  {
   "cell_type": "code",
   "id": "9c4dfa7e-dd78-403a-9e61-2422dd528f94",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:23.971995Z",
     "start_time": "2025-06-20T05:48:23.959859Z"
    }
   },
   "source": [
    "np.abs(df2) > 3*sigma"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "          0      1      2\n",
       "0     False  False  False\n",
       "1     False  False  False\n",
       "2     False  False  False\n",
       "3     False  False  False\n",
       "4     False  False  False\n",
       "...     ...    ...    ...\n",
       "9995  False  False  False\n",
       "9996  False  False  False\n",
       "9997  False  False  False\n",
       "9998  False  False  False\n",
       "9999  False  False  False\n",
       "\n",
       "[10000 rows x 3 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 3 columns</p>\n",
       "</div>"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 26
  },
  {
   "cell_type": "code",
   "id": "ce8e3ea4-387d-4622-9ad3-87df3948e23d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:24.000460Z",
     "start_time": "2025-06-20T05:48:23.990911Z"
    }
   },
   "source": [
    "# 只要有一个是True那就是True，\n",
    "# 意思就是这一行数据，只要有一列出现了异常值，那么这一行数据都算是异常数据。\n",
    "cond = (np.abs(df2) > 3*sigma).any(axis=1) \n",
    "cond"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       False\n",
       "1       False\n",
       "2       False\n",
       "3       False\n",
       "4       False\n",
       "        ...  \n",
       "9995    False\n",
       "9996    False\n",
       "9997    False\n",
       "9998    False\n",
       "9999    False\n",
       "Length: 10000, dtype: bool"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 27
  },
  {
   "cell_type": "code",
   "id": "06d645bc-3ae1-443c-becd-bf683cdd582a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:24.024448Z",
     "start_time": "2025-06-20T05:48:24.014362Z"
    }
   },
   "source": [
    "# 找到异常值\n",
    "df2[cond]"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "             0         1         2\n",
       "25   -1.367504 -3.485163  1.054120\n",
       "112  -2.062689  1.409998  3.033198\n",
       "255  -3.061884  0.977233  0.847411\n",
       "485  -1.294843  3.104825  0.037028\n",
       "708  -3.359639 -0.094490  0.542494\n",
       "...        ...       ...       ...\n",
       "8980  3.138709  0.343523  0.444321\n",
       "9028 -0.612501  0.591496 -3.021160\n",
       "9184 -1.329859 -3.241308 -1.437456\n",
       "9534 -0.463268  3.593391  0.350249\n",
       "9654  3.294832 -0.750633 -0.221132\n",
       "\n",
       "[74 rows x 3 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>-1.367504</td>\n",
       "      <td>-3.485163</td>\n",
       "      <td>1.054120</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>-2.062689</td>\n",
       "      <td>1.409998</td>\n",
       "      <td>3.033198</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>255</th>\n",
       "      <td>-3.061884</td>\n",
       "      <td>0.977233</td>\n",
       "      <td>0.847411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>485</th>\n",
       "      <td>-1.294843</td>\n",
       "      <td>3.104825</td>\n",
       "      <td>0.037028</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>708</th>\n",
       "      <td>-3.359639</td>\n",
       "      <td>-0.094490</td>\n",
       "      <td>0.542494</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8980</th>\n",
       "      <td>3.138709</td>\n",
       "      <td>0.343523</td>\n",
       "      <td>0.444321</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9028</th>\n",
       "      <td>-0.612501</td>\n",
       "      <td>0.591496</td>\n",
       "      <td>-3.021160</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9184</th>\n",
       "      <td>-1.329859</td>\n",
       "      <td>-3.241308</td>\n",
       "      <td>-1.437456</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9534</th>\n",
       "      <td>-0.463268</td>\n",
       "      <td>3.593391</td>\n",
       "      <td>0.350249</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9654</th>\n",
       "      <td>3.294832</td>\n",
       "      <td>-0.750633</td>\n",
       "      <td>-0.221132</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>74 rows × 3 columns</p>\n",
       "</div>"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 28
  },
  {
   "cell_type": "code",
   "id": "86fd7309-293c-4868-bb5c-efa423da79ce",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:24.050118Z",
     "start_time": "2025-06-20T05:48:24.041311Z"
    }
   },
   "source": [
    "# 异常值的索引\n",
    "index = df2[cond].index\n",
    "index"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index([  25,  112,  255,  485,  708,  824,  893,  928, 1059, 1333, 1698, 1842,\n",
       "       1877, 2087, 2312, 2450, 2704, 2725, 2726, 2761, 2764, 2859, 2883, 2999,\n",
       "       3174, 3223, 3225, 3278, 3301, 3352, 3886, 4163, 4926, 5060, 5075, 5322,\n",
       "       5340, 5350, 5394, 5447, 5526, 5633, 5780, 6050, 6212, 6224, 6371, 6375,\n",
       "       6457, 6477, 6651, 6724, 6729, 6801, 6830, 6904, 6910, 6918, 6976, 7015,\n",
       "       7272, 7520, 7659, 7846, 7905, 8147, 8387, 8656, 8841, 8980, 9028, 9184,\n",
       "       9534, 9654],\n",
       "      dtype='int64')"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 29
  },
  {
   "cell_type": "code",
   "id": "5e530f81-cc41-464b-aba5-d0f1b4ef01ca",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:24.077966Z",
     "start_time": "2025-06-20T05:48:24.067781Z"
    }
   },
   "source": [
    "# 依据行索引删除异常值\n",
    "df2.drop(index=index)"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "             0         1         2\n",
       "0    -1.170869 -0.003572  0.641649\n",
       "1    -0.782275 -1.015805 -0.866671\n",
       "2     0.086411  1.582896 -1.368608\n",
       "3     1.319277 -1.072977 -0.619248\n",
       "4     0.710679 -1.026320 -0.401833\n",
       "...        ...       ...       ...\n",
       "9995  0.039600  1.374489 -0.350555\n",
       "9996  0.866373  2.376633 -0.192814\n",
       "9997 -0.563033 -1.212696 -0.111150\n",
       "9998 -0.209867  1.462870 -1.271430\n",
       "9999  0.455442 -0.281785  0.341316\n",
       "\n",
       "[9926 rows x 3 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1.170869</td>\n",
       "      <td>-0.003572</td>\n",
       "      <td>0.641649</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.782275</td>\n",
       "      <td>-1.015805</td>\n",
       "      <td>-0.866671</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.086411</td>\n",
       "      <td>1.582896</td>\n",
       "      <td>-1.368608</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.319277</td>\n",
       "      <td>-1.072977</td>\n",
       "      <td>-0.619248</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.710679</td>\n",
       "      <td>-1.026320</td>\n",
       "      <td>-0.401833</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>0.039600</td>\n",
       "      <td>1.374489</td>\n",
       "      <td>-0.350555</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>0.866373</td>\n",
       "      <td>2.376633</td>\n",
       "      <td>-0.192814</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>-0.563033</td>\n",
       "      <td>-1.212696</td>\n",
       "      <td>-0.111150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>-0.209867</td>\n",
       "      <td>1.462870</td>\n",
       "      <td>-1.271430</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>0.455442</td>\n",
       "      <td>-0.281785</td>\n",
       "      <td>0.341316</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>9926 rows × 3 columns</p>\n",
       "</div>"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 30
  },
  {
   "cell_type": "code",
   "id": "c833a091-35c0-4bb5-bdb5-8d9e861f1118",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-06-20T05:48:24.132431Z",
     "start_time": "2025-06-20T05:48:24.127676Z"
    }
   },
   "source": [],
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
